Floormod

逐元素计算两个输入张量的 floor-modulus。

\[\text{output}_i = \text{input0}_i - \lfloor \frac{\text{input0}_i}{\text{input1}_i} \rfloor \cdot \text{input1}_i\]

其中 \(\lfloor \cdot \rfloor\) 表示向下取整 (floor) 操作。

输入:
  • input0 - 第一个输入张量(被除数)的数据地址。

  • input1 - 第二个输入张量(除数)的数据地址。

  • params - 参数数组(long long *),各元素按下列顺序依次存放(指针以整数形式存入,见调用示例)。
    • input0_dims - input0的维度信息。

    • input1_dims - input1的维度信息。

    • output_dims - output的维度信息。

    • strides0 - 输入张量0的步长信息。

    • strides1 - 输入张量1的步长信息。

    • strides_output - 输出张量的步长信息。

    • num_dims - 张量的维度数。

  • core_mask - 核掩码。

输出:
  • output - 输出张量的数据地址,其形状由 output_dims 指定(两个输入形状相同时与输入一致;如调用示例所示,输入形状不同时以 output_dims 为准)。

支持平台:

FT78NE MT7004

备注

  • FT78NE 支持fp32

  • MT7004 支持fp16, fp32

共享存储版本:

void fp_floor_mod_s(float *input0, float *input1, float *output, long long *params, int core_mask)
void hp_floor_mod_s(half *input0, half *input1, half *output, long long *params, int core_mask)
void dp_floor_mod_s(double *input0, double *input1, double *output, long long *params, int core_mask)

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3#include <floormod.h>
 4int main(int argc, char* argv[]) {
 5    float* input0 = (float*)0x81000000;
 6    float* input1 = (float*)0x82000000;
 7    float* output = (float*)0x83000000;
 8    int *strides0 = (int*)0x84000000;
 9    int *strides1 = (int*)0x85000000;
10    int *strides_output = (int*)0x86000000;
11
12    int core_mask = 0b1111;
13
14    // same shape
15    int input0_dims[] = {4, 1, 16};  // 2x2
16    int input1_dims[] = {4, 8, 16};  // 2x2
17    int output_dims[] = {4, 8, 16};      // 2x2
18    int num_dims = 3;
19
20    unsigned long long params[9];
21    params[0] = (unsigned long long)input0_dims;
22    params[1] = (unsigned long long)input1_dims;
23    params[2] = (unsigned long long)output_dims;
24    params[3] = (unsigned long long)strides0;
25    params[4] = (unsigned long long)strides1;
26    params[5] = (unsigned long long)strides_output;
27    params[6] = (unsigned long long)num_dims;
28
29    int total_input0 = get_total_elements(num_dims, input0_dims);
30    int total_input1 = get_total_elements(num_dims, input1_dims);
31    int total_output = get_total_elements(num_dims, output_dims);
32
33    srand(time(0));
34
35    int i;
36    for (i = 0; i < total_input0; ++i) {
37        input0[i] = (float)(rand() % 100) / 10.0f;
38    }
39
40    for (i = 0; i < total_input1; ++i) {
41        input1[i] = (float)(rand() % 100) / 10.0f + 0.01f;
42    }
43
44    fp_floor_mod_s(input0, input1, output, params, core_mask);
45    return 0;
46}

私有存储版本:

void fp_floor_mod_p(float *input0, float *input1, float *output, long long *params)
void hp_floor_mod_p(half *input0, half *input1, half *output, long long *params)
void dp_floor_mod_p(double *input0, double *input1, double *output, long long *params)

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3#include <floormod.h>
 4int main(int argc, char* argv[]) {
 5    float* input0 = (float*)0x10010000;
 6    float* input1 = (float*)0x10020000;
 7    float* output = (float*)0x10030000;
 8    int *strides0 = (int*)0x10050000;
 9    int *strides1 = (int*)0x10053000;
10    int *strides_output = (int*)0x10056000;
11
12    // same shape
13    int input0_dims[] = {4, 1, 16};  // 2x2
14    int input1_dims[] = {4, 8, 16};  // 2x2
15    int output_dims[] = {4, 8, 16};      // 2x2
16    int num_dims = 3;
17
18    unsigned long long params[9];
19    params[0] = (unsigned long long)input0_dims;
20    params[1] = (unsigned long long)input1_dims;
21    params[2] = (unsigned long long)output_dims;
22    params[3] = (unsigned long long)strides0;
23    params[4] = (unsigned long long)strides1;
24    params[5] = (unsigned long long)strides_output;
25    params[6] = (unsigned long long)num_dims;
26
27    int total_input0 = get_total_elements(num_dims, input0_dims);
28    int total_input1 = get_total_elements(num_dims, input1_dims);
29    int total_output = get_total_elements(num_dims, output_dims);
30
31    srand(time(0));
32
33    int i;
34    for (i = 0; i < total_input0; ++i) {
35        input0[i] = (float)(rand() % 100) / 10.0f;
36    }
37
38    for (i = 0; i < total_input1; ++i) {
39        input1[i] = (float)(rand() % 100) / 10.0f + 0.01f;
40    }
41
42    fp_floor_mod_p(input0, input1, output, params);
43    return 0;
44}